In [1]:
%load_ext autoreload
%autoreload 2

From a Twitter username and a set of publication titles, create a similarity measure

Example user:

Paul Groth


In [2]:
username = 'pgroth'

Publication titles found through Google Scholar


In [102]:
import json
# Load the publication titles scraped from Google Scholar.
with open('paul_pubs.json') as f:
    publication_titles = json.load(f)

In [103]:
from gensim import similarities, models, corpora, utils
from nltk.corpus import stopwords
stoplist = stopwords.words('english')

In [104]:
import re
re_split = re.compile(r'\W+')  # split on runs of non-word characters

Read in all publication titles and tokenize them, removing all stopwords and words that occur only once.


In [114]:
# Tokenize each title, dropping stopwords and empty strings.
texts = [[word for word in re_split.split(pub.lower()) if word not in stoplist and word != '']
         for pub in publication_titles]
# Collect all tokens and find those that appear only once in the whole corpus.
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once] for text in texts]
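
The `all_tokens.count(word)` call above rescans the whole token list once per unique token, which gets slow for larger corpora. A `collections.Counter` does the same frequency filtering in a single pass; a minimal equivalent sketch:


In [ ]:
from collections import Counter
# Count every token once, then keep only tokens that appear more than once.
token_counts = Counter(all_tokens)
texts = [[word for word in text if token_counts[word] > 1] for text in texts]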

Create a dictionary based on these texts, where every token gets an id, and build a 'bag-of-words' corpus from this dictionary: each title becomes a list of (token id, count) pairs.


In [115]:
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/paul.dict')
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/paul.mm', corpus)
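
To see what the bag-of-words representation looks like, we can print one tokenized title next to its `doc2bow` vector: each entry in the vector is a (token id, count) pair.


In [ ]:
# Peek at one title: its tokens and their (token id, within-title count) pairs.
print(texts[0])
print(dictionary.doc2bow(texts[0]))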

Create a Latent Semantic Indexing (LSI) model


In [123]:
lsi = models.LsiModel(corpus, id2word = dictionary, num_topics=10)
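
To get a feel for what the model learned, `print_topics` shows each topic as a weighted combination of tokens:


In [ ]:
# Inspect the extracted topics: each is a weighted mix of tokens.
lsi.print_topics(10)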

From this model we can build a similarity index that scores how close a new sentence is to each publication title. I hope we can also find a way to compare an entire Twitter feed to this model and say how similar it is to the corpus; a first sketch of that is at the end of this notebook.


In [124]:
index = similarities.MatrixSimilarity(lsi[corpus])
index.save('/tmp/paul.index')

In [125]:
vec_bow = dictionary.doc2bow('New sentence containing the words Provenance Query experiments chemistry data'.lower().split())
vec_lsi = lsi[vec_bow]   # project the query into the LSI topic space
sims = index[vec_lsi]    # cosine similarity of the query to every title
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims)


[(29, 0.98630798), (28, 0.9691515), (4, 0.93104374), (70, 0.90211278), (43, 0.77245337), (76, 0.76607108), (62, 0.76288593), (41, 0.73851448), (49, 0.73851448), (59, 0.73218888), (97, 0.72368819), (83, 0.70841205), (45, 0.70381987), (98, 0.69535011), (93, 0.69272143), (40, 0.69191432), (82, 0.68780637), (14, 0.68097329), (1, 0.67749143), (21, 0.65240085), (69, 0.65220439), (71, 0.64739621), (86, 0.64215934), (89, 0.63160402), (42, 0.63137156), (2, 0.62525499), (68, 0.61808687), (95, 0.61705238), (52, 0.60956055), (9, 0.60694212), (17, 0.59847367), (34, 0.59830147), (53, 0.5957135), (19, 0.59241885), (30, 0.59241885), (65, 0.59232157), (13, 0.59006155), (58, 0.58124965), (55, 0.5782932), (39, 0.57802844), (90, 0.56224394), (96, 0.5543443), (24, 0.53736526), (5, 0.53297108), (61, 0.53297108), (60, 0.53241158), (74, 0.51870018), (91, 0.51433152), (6, 0.50031388), (63, 0.49763128), (66, 0.4930937), (27, 0.49132413), (12, 0.48040721), (23, 0.4716045), (37, 0.44861746), (81, 0.43076739), (31, 0.42521673), (51, 0.38576302), (11, 0.38449615), (35, 0.37379849), (0, 0.36039618), (67, 0.33857304), (20, 0.33734435), (92, 0.33734435), (38, 0.3365238), (44, 0.26474395), (87, 0.25985718), (25, 0.17381135), (99, 0.15976539), (77, 0.1567848), (79, 0.1547946), (80, 0.14281552), (73, 0.13931268), (72, 0.12244276), (85, 0.10450181), (26, 0.10249986), (36, 0.061227877), (3, 0.057882704), (57, 0.052549478), (22, 0.044505171), (64, 0.043574587), (75, 0.037561137), (18, 0.0069109146), (32, 0.0023760106), (48, 0.0020793215), (7, 0.0), (78, 0.0), (15, -0.0019571483), (84, -0.010152709), (56, -0.010488324), (46, -0.015355438), (10, -0.021925382), (94, -0.03057088), (88, -0.054789431), (16, -0.055076569), (47, -0.055378556), (50, -0.070549972), (8, -0.099939071), (33, -0.099939071), (54, -0.13369395)]
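
The bare indices above are hard to interpret; mapping them back through `publication_titles` shows the actual best-matching titles, e.g. the top five:


In [ ]:
# Map the top matches back to the actual titles.
for doc_idx, score in sims[:5]:
    print('%.3f  %s' % (score, publication_titles[doc_idx]))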

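As a first stab at the Twitter comparison mentioned above, one crude approach is to treat the whole feed as a single document: tokenize it the same way as the titles and query the index with it. This is only a sketch; `tweets` is a hypothetical placeholder for the user's tweet texts, which still need to be fetched.


In [ ]:
# Sketch: compare a whole Twitter feed to the publication corpus.
# `tweets` is a placeholder for the user's tweet texts, fetched elsewhere.
tweets = ['...']
feed_tokens = [word for tweet in tweets
                    for word in re_split.split(tweet.lower())
                    if word not in stoplist and word != '']
feed_lsi = lsi[dictionary.doc2bow(feed_tokens)]
feed_sims = index[feed_lsi]
# One crude overall score: the mean similarity of the feed to all titles.
print(float(feed_sims.mean()))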